Rerun FeatureCounts and Compare

Rerunning featureCounts without -O flag but still same gtf file as before to compare.

ReRunning Feature Counts

#! /bin/bash -l

#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name=align_qc
#SBATCH --time=00:30:00 # HH/MM/SS
#SBATCH --mem=8G

mamba activate angsd

featureCounts -a /athena/angsd/scratch/cnm4001/hg38/hg38.gencodev41.gtf \
-o /athena/angsd/scratch/cnm4001/NPC_counts/NPC_combined_featureCounts.txt \
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849597.Aligned.sortedByCoord.out.bam /athena/angsd/scratch/cnm4001/NPC_bams/SRR11849598.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849599.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849600.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849601.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849602.Aligned.sortedByCoord.out.bam

Compare featureCounts results +/- -O flag

old_counts_summary <- read.table("/Users/Caden/Desktop/angsd/project/angsd_project/FeatureCounts/old_counts/NPC_combined_featureCounts.txt.summary", header = TRUE)

new_counts_summary <- read.table("/Users/Caden/Desktop/angsd/project/angsd_project/FeatureCounts/old_counts/reRun_NPC_combined_featureCounts.txt.summary", header = TRUE)
colnames(old_counts_summary) <- c("Status", seq(1:6))
colnames(new_counts_summary) <- c("Status", seq(1:6))

#show only non-zero rows
old_counts_summary[c(1,9,12,14), ]
##                     Status        1        2        3        4        5
## 1                 Assigned 23249396 23504788 23677886 23703714 23886091
## 9  Unassigned_MultiMapping 20259038 18100320 18091774 17992087 17293837
## 12   Unassigned_NoFeatures   947714  1134073   784544   984225   938354
## 14    Unassigned_Ambiguity        0        0        0        0        0
##           6
## 1  23567663
## 9  18836084
## 12   876303
## 14        0
new_counts_summary[c(1,9,12,14), ]
##                     Status        1        2        3        4        5
## 1                 Assigned  4235852  4747633  3232305  4742332  5132972
## 9  Unassigned_MultiMapping 20259038 18100320 18091774 17992087 17293837
## 12   Unassigned_NoFeatures   947714  1134073   784544   984225   938354
## 14    Unassigned_Ambiguity 19013544 18757155 20445581 18961382 18753119
##           6
## 1   3646303
## 9  18836084
## 12   876303
## 14 19921360

Seems like featureCounts also has lots of ambiguous reads. I would guess its the gtf issue.

Rerun Qorts

Download new gtf


#! /bin/bash -l

#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name=download
#SBATCH --time=02:00:00 # HH/MM/SS
#SBATCH --mem=8G

cd /athena/angsd/scratch/cnm4001/hg38
wget 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.annotation.gtf.gz' -O hg38.gencodev43.gtf.gz

Rerun Qorts with new gtf

#! /bin/bash -l

#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name=align_qc
#SBATCH --time=03:00:00 # HH/MM/SS
#SBATCH --mem=20G


cd /athena/angsd/scratch/cnm4001/NPC_alignmentQC
mkdir ReRun_Qorts
cd ReRun_Qorts

for index in 597 598 599 600 601 602; do
  mkdir reRun_SRR11849${index}_alignmentQC
done

cd /athena/angsd/scratch/cnm4001/NPC_bams
mamba activate qorts

for index in 597 598 599 600 601 602; do
  qorts -Xmx18000M QC \
  --generatePlots \
  --singleEnded \
   SRR11849${index}.Aligned.sortedByCoord.out.bam \
  /athena/angsd/scratch/cnm4001/hg38/hg38.gencodev43.gtf  \
  /athena/angsd/scratch/cnm4001/NPC_alignmentQC/reRun_SRR11849${index}_alignmentQC
done

Rerun Qorts Alignments.

It appears that the gtf was the issue since rerunning Qorts with new gtf has majority of reads uniquly aligning.

Rerun featureCounts with new gtf

#! /bin/bash -l

#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name=align_qc
#SBATCH --time=00:30:00 # HH/MM/SS
#SBATCH --mem=8G

mamba activate angsd

featureCounts -a /athena/angsd/scratch/cnm4001/hg38/hg38.gencodev43.gtf \
-o /athena/angsd/scratch/cnm4001/NPC_counts/NPC_combined_featureCounts.txt \
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849597.Aligned.sortedByCoord.out.bam /athena/angsd/scratch/cnm4001/NPC_bams/SRR11849598.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849599.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849600.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849601.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849602.Aligned.sortedByCoord.out.bam

Read in new feature counts

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
counts_matrix <- read.table("/Users/Caden/Desktop/angsd/project/angsd_project/FeatureCounts/new_counts/newGTF_NPC_combined_featureCounts.txt", header = TRUE, row.names = 1)
metadata<- read.csv("/Users/Caden/Downloads/SraRunTable-3.txt", header = TRUE)
counts_matrix <- counts_matrix[,-(1:5)]
new_sample_names <- c("Control_2", "Control_3", "Control_4", "GPE_2", "GPE_3", "GPE_4")
colnames(counts_matrix) <- new_sample_names
log_counts_matrix <-log2(counts_matrix)

DEseq object

library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract
sample_info <- data.frame(condition = gsub("_[0-9]+", "", colnames(counts_matrix)),
row.names = names(counts_matrix) )

#DEseq requires intergers 
counts_matrix <- round(counts_matrix, 0)
#Create DEseq object
DESeq.ds <- DESeqDataSetFromMatrix(countData = as.matrix(counts_matrix),
colData = sample_info,
design = ~ condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#reads per sample
colSums(counts(DESeq.ds))
## Control_2 Control_3 Control_4     GPE_2     GPE_3     GPE_4 
##  21347314  21655670  21627748  21783490  21977994  21565771
#remove genes with no reads
keep_genes <- rowSums(counts(DESeq.ds)) > 0
DESeq.ds <- DESeq.ds[ keep_genes, ]
dim(DESeq.ds)
## [1] 32213     6
#Size factor
DESeq.ds <- estimateSizeFactors(DESeq.ds) # calculate SFs, add them to object
plot( sizeFactors(DESeq.ds), colSums(counts(DESeq.ds)), # assess them
ylab = "library sizes", xlab = "size factors", cex = .6 )

par(mfrow=c(1,2))
## extracting normalized counts
counts.sf_normalized <- counts(DESeq.ds, normalized=TRUE)
## adding the boxplots
boxplot(counts(DESeq.ds), main = "read counts only", cex = .6)
boxplot(counts.sf_normalized, main = "SF normalized", cex = .6)

par(mfrow=c(1,2)) # to plot the two box plots next to each other
## bp of non-normalized
boxplot(log2(counts(DESeq.ds) +1), notch=TRUE,
main = "Non-normalized read counts",
ylab ="log2(read counts)", cex = .6)
## bp of size-factor normalized values
boxplot(log2(counts(DESeq.ds, normalized=TRUE) +1), notch=TRUE,
main = "Size-factor-normalized read counts",
ylab ="log2(read counts)", cex = .6)

## non-normalized read counts plus pseudocount
log.counts <- log2(counts(DESeq.ds, normalized = FALSE) + 1)
## instead of creating a new object, we could assign the values to a distinct matrix
## within the DESeq.ds object
assay(DESeq.ds, "log.counts") <- log2(counts(DESeq.ds, normalized = FALSE) + 1)
## normalized read counts
assay(DESeq.ds, "log.norm.counts") <- log2(counts(DESeq.ds, normalized=TRUE) + 1)
par(mfrow=c(1,2))
DESeq.ds[, c("Control_2","Control_3")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "Control_2 vs Control_3")
DESeq.ds[, c("GPE_2","GPE_3")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "GPE_2 vs GPE_3")

par(mfrow=c(1,3))
DESeq.ds[, c("Control_2","Control_3")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "Control_2 vs Control_3")
DESeq.ds[, c("Control_2","Control_4")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "Control_2 vs Control_4")
DESeq.ds[, c("Control_3","Control_4")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "Control_3 vs Control_4")

par(mfrow=c(1,3))
DESeq.ds[, c("GPE_2","Control_3")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "GPE_2 vs GPE_3")
DESeq.ds[, c("GPE_2","GPE_4")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "GPE_2 vs GPE_4")
DESeq.ds[, c("GPE_3","GPE_4")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "GPE_3 vs GPE_4")

DESeq.rlog <- rlog(DESeq.ds, blind = FALSE) #since being treated with drug, I am trying blind
par(mfrow=c(1,2))
plot(assay(DESeq.ds, "log.norm.counts")[,1:2], cex=.1,
main = "size factor and log2-transformed")
## the rlog-transformed counts are stored in the "assay" accessor
plot(assay(DESeq.rlog)[,1:2],
cex=.1, main = "rlog transformed",
xlab = colnames(assay(DESeq.rlog[,1])),
ylab = colnames(assay(DESeq.rlog[,2])) )

PCA Explorer

library(pcaExplorer)
pcaExplorer(dds = DESeq.ds, dst = DESeq.rlog)

PCA.

heatmap.